# Importing Libraries
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
pip install -U dataprep
Requirement already satisfied: dataprep in c:\users\alade\anaconda3\lib\site-packages (0.4.5) Requirement already satisfied: regex<2022.0.0,>=2021.8.3 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (2021.11.10) Requirement already satisfied: varname<0.9.0,>=0.8.1 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (0.8.3) Requirement already satisfied: bokeh<3,>=2 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (2.4.3) Requirement already satisfied: scipy<2.0,>=1.8 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (1.9.1) Requirement already satisfied: aiohttp<4.0,>=3.6 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (3.8.3) Requirement already satisfied: numpy<2.0,>=1.21 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (1.21.5) Requirement already satisfied: nltk<4.0.0,>=3.6.7 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (3.7) Requirement already satisfied: pandas<2.0,>=1.1 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (1.4.4) Requirement already satisfied: jsonpath-ng<2.0,>=1.5 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (1.5.3) Requirement already satisfied: rapidfuzz<3.0.0,>=2.1.2 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (2.13.2) Requirement already satisfied: dask[array,dataframe,delayed]>=2022.3.0 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (2022.7.0) Requirement already satisfied: wordcloud<2.0,>=1.8 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (1.8.2.2) Requirement already satisfied: tqdm<5.0,>=4.48 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (4.64.1) Requirement already satisfied: jinja2<3.1,>=3.0 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (3.0.3) Requirement already satisfied: python-crfsuite==0.9.8 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (0.9.8) Requirement already satisfied: python-stdnum<2.0,>=1.16 in 
c:\users\alade\anaconda3\lib\site-packages (from dataprep) (1.18) Requirement already satisfied: sqlalchemy==1.3.24 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (1.3.24) Requirement already satisfied: pydantic<2.0,>=1.6 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (1.10.2) Requirement already satisfied: flask_cors<4.0.0,>=3.0.10 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (3.0.10) Requirement already satisfied: pydot<2.0.0,>=1.4.2 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (1.4.2) Requirement already satisfied: flask<3,>=2 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (2.2.2) Requirement already satisfied: metaphone<0.7,>=0.6 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (0.6) Requirement already satisfied: ipywidgets<8.0,>=7.5 in c:\users\alade\anaconda3\lib\site-packages (from dataprep) (7.6.5) Requirement already satisfied: aiosignal>=1.1.2 in c:\users\alade\anaconda3\lib\site-packages (from aiohttp<4.0,>=3.6->dataprep) (1.3.1) Requirement already satisfied: multidict<7.0,>=4.5 in c:\users\alade\anaconda3\lib\site-packages (from aiohttp<4.0,>=3.6->dataprep) (6.0.2) Requirement already satisfied: attrs>=17.3.0 in c:\users\alade\anaconda3\lib\site-packages (from aiohttp<4.0,>=3.6->dataprep) (21.4.0) Requirement already satisfied: frozenlist>=1.1.1 in c:\users\alade\anaconda3\lib\site-packages (from aiohttp<4.0,>=3.6->dataprep) (1.3.3) Requirement already satisfied: yarl<2.0,>=1.0 in c:\users\alade\anaconda3\lib\site-packages (from aiohttp<4.0,>=3.6->dataprep) (1.8.1) Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in c:\users\alade\anaconda3\lib\site-packages (from aiohttp<4.0,>=3.6->dataprep) (4.0.2) Requirement already satisfied: charset-normalizer<3.0,>=2.0 in c:\users\alade\anaconda3\lib\site-packages (from aiohttp<4.0,>=3.6->dataprep) (2.0.4) Requirement already satisfied: pillow>=7.1.0 in c:\users\alade\anaconda3\lib\site-packages (from 
bokeh<3,>=2->dataprep) (9.2.0) Requirement already satisfied: PyYAML>=3.10 in c:\users\alade\anaconda3\lib\site-packages (from bokeh<3,>=2->dataprep) (6.0) Requirement already satisfied: typing-extensions>=3.10.0 in c:\users\alade\anaconda3\lib\site-packages (from bokeh<3,>=2->dataprep) (4.3.0) Requirement already satisfied: tornado>=5.1 in c:\users\alade\anaconda3\lib\site-packages (from bokeh<3,>=2->dataprep) (6.1) Requirement already satisfied: packaging>=16.8 in c:\users\alade\anaconda3\lib\site-packages (from bokeh<3,>=2->dataprep) (21.3) Requirement already satisfied: toolz>=0.8.2 in c:\users\alade\anaconda3\lib\site-packages (from dask[array,dataframe,delayed]>=2022.3.0->dataprep) (0.11.2) Requirement already satisfied: fsspec>=0.6.0 in c:\users\alade\anaconda3\lib\site-packages (from dask[array,dataframe,delayed]>=2022.3.0->dataprep) (2022.7.1) Requirement already satisfied: cloudpickle>=1.1.1 in c:\users\alade\anaconda3\lib\site-packages (from dask[array,dataframe,delayed]>=2022.3.0->dataprep) (2.0.0) Requirement already satisfied: partd>=0.3.10 in c:\users\alade\anaconda3\lib\site-packages (from dask[array,dataframe,delayed]>=2022.3.0->dataprep) (1.2.0) Requirement already satisfied: importlib-metadata>=3.6.0 in c:\users\alade\anaconda3\lib\site-packages (from flask<3,>=2->dataprep) (4.11.3) Requirement already satisfied: itsdangerous>=2.0 in c:\users\alade\anaconda3\lib\site-packages (from flask<3,>=2->dataprep) (2.0.1) Requirement already satisfied: click>=8.0 in c:\users\alade\anaconda3\lib\site-packages (from flask<3,>=2->dataprep) (8.0.4) Requirement already satisfied: Werkzeug>=2.2.2 in c:\users\alade\anaconda3\lib\site-packages (from flask<3,>=2->dataprep) (2.2.2) Requirement already satisfied: Six in c:\users\alade\anaconda3\lib\site-packages (from flask_cors<4.0.0,>=3.0.10->dataprep) (1.16.0) Requirement already satisfied: widgetsnbextension~=3.5.0 in c:\users\alade\anaconda3\lib\site-packages (from ipywidgets<8.0,>=7.5->dataprep) (3.5.2) 
Requirement already satisfied: ipython-genutils~=0.2.0 in c:\users\alade\anaconda3\lib\site-packages (from ipywidgets<8.0,>=7.5->dataprep) (0.2.0) Requirement already satisfied: traitlets>=4.3.1 in c:\users\alade\anaconda3\lib\site-packages (from ipywidgets<8.0,>=7.5->dataprep) (5.1.1) Requirement already satisfied: jupyterlab-widgets>=1.0.0 in c:\users\alade\anaconda3\lib\site-packages (from ipywidgets<8.0,>=7.5->dataprep) (1.0.0) Requirement already satisfied: ipykernel>=4.5.1 in c:\users\alade\anaconda3\lib\site-packages (from ipywidgets<8.0,>=7.5->dataprep) (6.15.2) Requirement already satisfied: ipython>=4.0.0 in c:\users\alade\anaconda3\lib\site-packages (from ipywidgets<8.0,>=7.5->dataprep) (7.31.1) Requirement already satisfied: nbformat>=4.2.0 in c:\users\alade\anaconda3\lib\site-packages (from ipywidgets<8.0,>=7.5->dataprep) (5.5.0) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\alade\anaconda3\lib\site-packages (from jinja2<3.1,>=3.0->dataprep) (2.1.1) Requirement already satisfied: ply in c:\users\alade\anaconda3\lib\site-packages (from jsonpath-ng<2.0,>=1.5->dataprep) (3.11) Requirement already satisfied: decorator in c:\users\alade\anaconda3\lib\site-packages (from jsonpath-ng<2.0,>=1.5->dataprep) (5.1.1) Requirement already satisfied: joblib in c:\users\alade\anaconda3\lib\site-packages (from nltk<4.0.0,>=3.6.7->dataprep) (1.1.0) Requirement already satisfied: pytz>=2020.1 in c:\users\alade\anaconda3\lib\site-packages (from pandas<2.0,>=1.1->dataprep) (2022.1) Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\alade\anaconda3\lib\site-packages (from pandas<2.0,>=1.1->dataprep) (2.8.2) Requirement already satisfied: pyparsing>=2.1.4 in c:\users\alade\anaconda3\lib\site-packages (from pydot<2.0.0,>=1.4.2->dataprep) (3.0.9) Requirement already satisfied: colorama in c:\users\alade\anaconda3\lib\site-packages (from tqdm<5.0,>=4.48->dataprep) (0.4.5) Requirement already satisfied: pure_eval<1.0.0 in 
c:\users\alade\anaconda3\lib\site-packages (from varname<0.9.0,>=0.8.1->dataprep) (0.2.2) Requirement already satisfied: asttokens<3.0.0,>=2.0.0 in c:\users\alade\anaconda3\lib\site-packages (from varname<0.9.0,>=0.8.1->dataprep) (2.1.0) Requirement already satisfied: executing<0.9.0,>=0.8.3 in c:\users\alade\anaconda3\lib\site-packages (from varname<0.9.0,>=0.8.1->dataprep) (0.8.3) Requirement already satisfied: matplotlib in c:\users\alade\anaconda3\lib\site-packages (from wordcloud<2.0,>=1.8->dataprep) (3.5.2) Requirement already satisfied: zipp>=0.5 in c:\users\alade\anaconda3\lib\site-packages (from importlib-metadata>=3.6.0->flask<3,>=2->dataprep) (3.8.0) Requirement already satisfied: psutil in c:\users\alade\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets<8.0,>=7.5->dataprep) (5.9.0) Requirement already satisfied: jupyter-client>=6.1.12 in c:\users\alade\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets<8.0,>=7.5->dataprep) (7.3.4) Requirement already satisfied: matplotlib-inline>=0.1 in c:\users\alade\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets<8.0,>=7.5->dataprep) (0.1.6) Requirement already satisfied: nest-asyncio in c:\users\alade\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets<8.0,>=7.5->dataprep) (1.5.5) Requirement already satisfied: debugpy>=1.0 in c:\users\alade\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets<8.0,>=7.5->dataprep) (1.5.1) Requirement already satisfied: pyzmq>=17 in c:\users\alade\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets<8.0,>=7.5->dataprep) (23.2.0) Requirement already satisfied: jedi>=0.16 in c:\users\alade\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets<8.0,>=7.5->dataprep) (0.18.1) Requirement already satisfied: pygments in c:\users\alade\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets<8.0,>=7.5->dataprep) (2.11.2) Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in 
c:\users\alade\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets<8.0,>=7.5->dataprep) (3.0.20) Requirement already satisfied: setuptools>=18.5 in c:\users\alade\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets<8.0,>=7.5->dataprep) (63.4.1) Requirement already satisfied: backcall in c:\users\alade\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets<8.0,>=7.5->dataprep) (0.2.0) Requirement already satisfied: pickleshare in c:\users\alade\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets<8.0,>=7.5->dataprep) (0.7.5) Requirement already satisfied: fastjsonschema in c:\users\alade\anaconda3\lib\site-packages (from nbformat>=4.2.0->ipywidgets<8.0,>=7.5->dataprep) (2.16.2) Requirement already satisfied: jupyter_core in c:\users\alade\anaconda3\lib\site-packages (from nbformat>=4.2.0->ipywidgets<8.0,>=7.5->dataprep) (4.11.1) Requirement already satisfied: jsonschema>=2.6 in c:\users\alade\anaconda3\lib\site-packages (from nbformat>=4.2.0->ipywidgets<8.0,>=7.5->dataprep) (4.16.0) Requirement already satisfied: locket in c:\users\alade\anaconda3\lib\site-packages (from partd>=0.3.10->dask[array,dataframe,delayed]>=2022.3.0->dataprep) (1.0.0) Requirement already satisfied: notebook>=4.4.1 in c:\users\alade\anaconda3\lib\site-packages (from widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (6.4.12) Requirement already satisfied: idna>=2.0 in c:\users\alade\anaconda3\lib\site-packages (from yarl<2.0,>=1.0->aiohttp<4.0,>=3.6->dataprep) (3.3) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\alade\anaconda3\lib\site-packages (from matplotlib->wordcloud<2.0,>=1.8->dataprep) (1.4.2) Requirement already satisfied: cycler>=0.10 in c:\users\alade\anaconda3\lib\site-packages (from matplotlib->wordcloud<2.0,>=1.8->dataprep) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\alade\anaconda3\lib\site-packages (from matplotlib->wordcloud<2.0,>=1.8->dataprep) (4.25.0) Requirement already satisfied: 
parso<0.9.0,>=0.8.0 in c:\users\alade\anaconda3\lib\site-packages (from jedi>=0.16->ipython>=4.0.0->ipywidgets<8.0,>=7.5->dataprep) (0.8.3) Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in c:\users\alade\anaconda3\lib\site-packages (from jsonschema>=2.6->nbformat>=4.2.0->ipywidgets<8.0,>=7.5->dataprep) (0.18.0) Requirement already satisfied: entrypoints in c:\users\alade\anaconda3\lib\site-packages (from jupyter-client>=6.1.12->ipykernel>=4.5.1->ipywidgets<8.0,>=7.5->dataprep) (0.4) Requirement already satisfied: pywin32>=1.0 in c:\users\alade\anaconda3\lib\site-packages (from jupyter_core->nbformat>=4.2.0->ipywidgets<8.0,>=7.5->dataprep) (302) Requirement already satisfied: terminado>=0.8.3 in c:\users\alade\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (0.13.1) Requirement already satisfied: Send2Trash>=1.8.0 in c:\users\alade\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (1.8.0) Requirement already satisfied: nbconvert>=5 in c:\users\alade\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (6.4.4) Requirement already satisfied: argon2-cffi in c:\users\alade\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (21.3.0) Requirement already satisfied: prometheus-client in c:\users\alade\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (0.14.1) Requirement already satisfied: wcwidth in c:\users\alade\anaconda3\lib\site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=4.0.0->ipywidgets<8.0,>=7.5->dataprep) (0.2.5) Requirement already satisfied: bleach in c:\users\alade\anaconda3\lib\site-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (4.1.0) Requirement 
already satisfied: pandocfilters>=1.4.1 in c:\users\alade\anaconda3\lib\site-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (1.5.0) Requirement already satisfied: nbclient<0.6.0,>=0.5.0 in c:\users\alade\anaconda3\lib\site-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (0.5.13) Requirement already satisfied: defusedxml in c:\users\alade\anaconda3\lib\site-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (0.7.1) Requirement already satisfied: beautifulsoup4 in c:\users\alade\anaconda3\lib\site-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (4.11.1) Requirement already satisfied: mistune<2,>=0.8.1 in c:\users\alade\anaconda3\lib\site-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (0.8.4) Requirement already satisfied: jupyterlab-pygments in c:\users\alade\anaconda3\lib\site-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (0.1.2) Requirement already satisfied: testpath in c:\users\alade\anaconda3\lib\site-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (0.6.0) Requirement already satisfied: pywinpty>=1.1.0 in c:\users\alade\anaconda3\lib\site-packages (from terminado>=0.8.3->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (2.0.2) Requirement already satisfied: argon2-cffi-bindings in c:\users\alade\anaconda3\lib\site-packages (from argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (21.2.0) Requirement already satisfied: cffi>=1.0.1 in c:\users\alade\anaconda3\lib\site-packages (from argon2-cffi-bindings->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (1.15.1) 
Requirement already satisfied: soupsieve>1.2 in c:\users\alade\anaconda3\lib\site-packages (from beautifulsoup4->nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (2.3.1) Requirement already satisfied: webencodings in c:\users\alade\anaconda3\lib\site-packages (from bleach->nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (0.5.1) Requirement already satisfied: pycparser in c:\users\alade\anaconda3\lib\site-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets<8.0,>=7.5->dataprep) (2.21) Note: you may need to restart the kernel to use updated packages.
# Loading the dataset
# Read the obesity CSV (relative path, so it must sit in the working directory)
# into a pandas DataFrame used by every later cell.
Obesity_data = pd.read_csv('Obesitydataset.csv')
from dataprep.eda import plot
# Render dataprep's overview of the whole DataFrame (dataset statistics and
# insights such as skewed columns, duplicates and zero counts).
plot(Obesity_data)
0%| | 0/755 [00:00<…
| Number of Variables | 17 |
|---|---|
| Number of Rows | 2111 |
| Missing Cells | 0 |
| Missing Cells (%) | 0.0% |
| Duplicate Rows | 24 |
| Duplicate Rows (%) | 1.1% |
| Total Size in Memory | 1.3 MB |
| Average Row Size in Memory | 641.9 B |
| Variable Types |
|
| FCVC is skewed | Skewed |
|---|---|
| NCP is skewed | Skewed |
| CH2O is skewed | Skewed |
| FAF is skewed | Skewed |
| TUE is skewed | Skewed |
| Dataset has 24 (1.14%) duplicate rows | Duplicates |
| FAF has 411 (19.47%) zeros | Zeros |
| TUE has 557 (26.39%) zeros | Zeros |
Number of plots per page:
### At a glance, we can observe the following from the graph:
from dataprep.eda import plot_correlation
# Correlation overview of the DataFrame; the rendered summary reports
# Pearson, Spearman and KendallTau statistics.
plot_correlation(Obesity_data)
| Pearson | Spearman | KendallTau | |
|---|---|---|---|
| Highest Positive Correlation | 0.463 | 0.463 | 0.322 |
| Highest Negative Correlation | -0.297 | -0.298 | -0.214 |
| Lowest Correlation | 0.012 | 0.003 | 0.002 |
| Mean Correlation | 0.056 | 0.063 | 0.045 |
Obesity_data.shape
(2111, 17)
Obesity_data.head()
| Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | NObeyesdad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 21.0 | 1.62 | 64.0 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 0.0 | 1.0 | no | Public_Transportation | Normal_Weight |
| 1 | Female | 21.0 | 1.52 | 56.0 | yes | no | 3.0 | 3.0 | Sometimes | yes | 3.0 | yes | 3.0 | 0.0 | Sometimes | Public_Transportation | Normal_Weight |
| 2 | Male | 23.0 | 1.80 | 77.0 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 2.0 | 1.0 | Frequently | Public_Transportation | Normal_Weight |
| 3 | Male | 27.0 | 1.80 | 87.0 | no | no | 3.0 | 3.0 | Sometimes | no | 2.0 | no | 2.0 | 0.0 | Frequently | Walking | Overweight_Level_I |
| 4 | Male | 22.0 | 1.78 | 89.8 | no | no | 2.0 | 1.0 | Sometimes | no | 2.0 | no | 0.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_II |
Obesity_data.tail()
| Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | NObeyesdad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2106 | Female | 20.976842 | 1.710730 | 131.408528 | yes | yes | 3.0 | 3.0 | Sometimes | no | 1.728139 | no | 1.676269 | 0.906247 | Sometimes | Public_Transportation | Obesity_Type_III |
| 2107 | Female | 21.982942 | 1.748584 | 133.742943 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.005130 | no | 1.341390 | 0.599270 | Sometimes | Public_Transportation | Obesity_Type_III |
| 2108 | Female | 22.524036 | 1.752206 | 133.689352 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.054193 | no | 1.414209 | 0.646288 | Sometimes | Public_Transportation | Obesity_Type_III |
| 2109 | Female | 24.361936 | 1.739450 | 133.346641 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.852339 | no | 1.139107 | 0.586035 | Sometimes | Public_Transportation | Obesity_Type_III |
| 2110 | Female | 23.664709 | 1.738836 | 133.472641 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.863513 | no | 1.026452 | 0.714137 | Sometimes | Public_Transportation | Obesity_Type_III |
Obesity_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2111 entries, 0 to 2110 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 2111 non-null object 1 Age 2111 non-null float64 2 Height 2111 non-null float64 3 Weight 2111 non-null float64 4 family_history_with_overweight 2111 non-null object 5 FAVC 2111 non-null object 6 FCVC 2111 non-null float64 7 NCP 2111 non-null float64 8 CAEC 2111 non-null object 9 SMOKE 2111 non-null object 10 CH2O 2111 non-null float64 11 SCC 2111 non-null object 12 FAF 2111 non-null float64 13 TUE 2111 non-null float64 14 CALC 2111 non-null object 15 MTRANS 2111 non-null object 16 NObeyesdad 2111 non-null object dtypes: float64(8), object(9) memory usage: 280.5+ KB
Obesity_data.describe()
| Age | Height | Weight | FCVC | NCP | CH2O | FAF | TUE | |
|---|---|---|---|---|---|---|---|---|
| count | 2111.000000 | 2111.000000 | 2111.000000 | 2111.000000 | 2111.000000 | 2111.000000 | 2111.000000 | 2111.000000 |
| mean | 24.312600 | 1.701677 | 86.586058 | 2.419043 | 2.685628 | 2.008011 | 1.010298 | 0.657866 |
| std | 6.345968 | 0.093305 | 26.191172 | 0.533927 | 0.778039 | 0.612953 | 0.850592 | 0.608927 |
| min | 14.000000 | 1.450000 | 39.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
| 25% | 19.947192 | 1.630000 | 65.473343 | 2.000000 | 2.658738 | 1.584812 | 0.124505 | 0.000000 |
| 50% | 22.777890 | 1.700499 | 83.000000 | 2.385502 | 3.000000 | 2.000000 | 1.000000 | 0.625350 |
| 75% | 26.000000 | 1.768464 | 107.430682 | 3.000000 | 3.000000 | 2.477420 | 1.666678 | 1.000000 |
| max | 61.000000 | 1.980000 | 173.000000 | 3.000000 | 4.000000 | 3.000000 | 3.000000 | 2.000000 |
Obesity_data.columns
Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
'CALC', 'MTRANS', 'NObeyesdad'],
dtype='object')
# Summarise every column: its number of distinct values, then the frequency
# of each value.
for col in Obesity_data.columns:
    # Header line: column name and how many distinct values it holds.
    print(f'{col} -- ({Obesity_data[col].nunique()} unique items):')
    # value_counts() lists each distinct value with its count, most frequent first.
    print(f'{Obesity_data[col].value_counts()}')
    print('')  # blank line after each column's counts
    print('-' * 30)  # horizontal rule to separate one column from the next
Gender -- (2 unique items):
Male 1068
Female 1043
Name: Gender, dtype: int64
------------------------------
Age -- (1402 unique items):
18.000000 128
26.000000 101
21.000000 96
23.000000 89
19.000000 59
...
23.320120 1
34.243146 1
18.549437 1
36.310292 1
23.664709 1
Name: Age, Length: 1402, dtype: int64
------------------------------
Height -- (1574 unique items):
1.700000 60
1.650000 50
1.600000 43
1.750000 39
1.620000 36
..
1.842943 1
1.706082 1
1.704141 1
1.705813 1
1.738836 1
Name: Height, Length: 1574, dtype: int64
------------------------------
Weight -- (1525 unique items):
80.000000 59
70.000000 43
50.000000 42
75.000000 40
60.000000 37
..
65.140408 1
87.279890 1
67.083121 1
90.138680 1
133.472641 1
Name: Weight, Length: 1525, dtype: int64
------------------------------
family_history_with_overweight -- (2 unique items):
yes 1726
no 385
Name: family_history_with_overweight, dtype: int64
------------------------------
FAVC -- (2 unique items):
yes 1866
no 245
Name: FAVC, dtype: int64
------------------------------
FCVC -- (810 unique items):
3.000000 652
2.000000 600
1.000000 33
2.823179 2
2.214980 2
...
2.927409 1
2.706134 1
2.010684 1
2.300408 1
2.680375 1
Name: FCVC, Length: 810, dtype: int64
------------------------------
NCP -- (635 unique items):
3.000000 1203
1.000000 199
4.000000 69
2.776840 2
3.985442 2
...
3.054899 1
3.118013 1
3.335876 1
3.205009 1
1.089048 1
Name: NCP, Length: 635, dtype: int64
------------------------------
CAEC -- (4 unique items):
Sometimes 1765
Frequently 242
Always 53
no 51
Name: CAEC, dtype: int64
------------------------------
SMOKE -- (2 unique items):
no 2067
yes 44
Name: SMOKE, dtype: int64
------------------------------
CH2O -- (1268 unique items):
2.000000 448
1.000000 211
3.000000 162
2.825629 3
1.636326 3
...
1.622638 1
2.452986 1
2.035954 1
1.944095 1
2.863513 1
Name: CH2O, Length: 1268, dtype: int64
------------------------------
SCC -- (2 unique items):
no 2015
yes 96
Name: SCC, dtype: int64
------------------------------
FAF -- (1190 unique items):
0.000000 411
1.000000 234
2.000000 183
3.000000 75
0.110174 2
...
1.916751 1
0.954459 1
0.340915 1
0.986414 1
1.026452 1
Name: FAF, Length: 1190, dtype: int64
------------------------------
TUE -- (1129 unique items):
0.000000 557
1.000000 292
2.000000 109
0.630866 4
1.119877 3
...
1.343044 1
1.019452 1
0.673408 1
0.997600 1
0.714137 1
Name: TUE, Length: 1129, dtype: int64
------------------------------
CALC -- (4 unique items):
Sometimes 1401
no 639
Frequently 70
Always 1
Name: CALC, dtype: int64
------------------------------
MTRANS -- (5 unique items):
Public_Transportation 1580
Automobile 457
Walking 56
Motorbike 11
Bike 7
Name: MTRANS, dtype: int64
------------------------------
NObeyesdad -- (7 unique items):
Obesity_Type_I 351
Obesity_Type_III 324
Obesity_Type_II 297
Overweight_Level_I 290
Overweight_Level_II 290
Normal_Weight 287
Insufficient_Weight 272
Name: NObeyesdad, dtype: int64
------------------------------
# Mean of each numeric feature per obesity class. numeric_only=True explicitly
# skips the categorical (object) columns; pandas >= 2.0 no longer drops them
# silently and would raise a TypeError without it.
Obesity_data.groupby('NObeyesdad').mean(numeric_only=True)
| Age | Height | Weight | FCVC | NCP | CH2O | FAF | TUE | |
|---|---|---|---|---|---|---|---|---|
| NObeyesdad | ||||||||
| Insufficient_Weight | 19.783237 | 1.691117 | 49.906330 | 2.480788 | 2.914403 | 1.871281 | 1.250131 | 0.839459 |
| Normal_Weight | 21.738676 | 1.676585 | 62.155052 | 2.334495 | 2.738676 | 1.850174 | 1.247387 | 0.675958 |
| Obesity_Type_I | 25.884941 | 1.693804 | 92.870198 | 2.186050 | 2.431862 | 2.112218 | 0.986748 | 0.676743 |
| Obesity_Type_II | 28.233785 | 1.771795 | 115.305311 | 2.391284 | 2.744555 | 1.877658 | 0.971857 | 0.515186 |
| Obesity_Type_III | 23.495554 | 1.687559 | 120.941114 | 3.000000 | 3.000000 | 2.208493 | 0.664817 | 0.604623 |
| Overweight_Level_I | 23.417674 | 1.687836 | 74.266828 | 2.264631 | 2.504218 | 2.058725 | 1.056796 | 0.612992 |
| Overweight_Level_II | 26.996981 | 1.703748 | 82.085271 | 2.260578 | 2.495529 | 2.025133 | 0.958072 | 0.697275 |
from dataprep.eda import create_report
# Build dataprep's EDA report for the DataFrame; the result is kept in
# `report` so it can be saved and opened in the browser below.
report=create_report(Obesity_data)
0%| | 0/2397 [00:00<…
#report.show()
report.save('Report')
Report has been saved to Report.html!
report.show_browser()
from dataprep.eda import plot_missing
plot_missing(Obesity_data)
0%| | 0/198 [00:00<…
| Missing Cells | 0 |
|---|---|
| Missing Cells (%) | 0.0% |
| Missing Columns | 0 |
| Missing Rows | 0 |
| Avg Missing Cells per Column | 0.0 |
| Avg Missing Cells per Row | 0.0 |
#Checking for null values
Obesity_data.isnull().sum()
Gender 0 Age 0 Height 0 Weight 0 family_history_with_overweight 0 FAVC 0 FCVC 0 NCP 0 CAEC 0 SMOKE 0 CH2O 0 SCC 0 FAF 0 TUE 0 CALC 0 MTRANS 0 NObeyesdad 0 dtype: int64
If, for example, Height and Weight had null values, we could use the code below to impute the missing values using one of the following strategies: mean, median, most_frequent, or constant.
import numpy as np
from sklearn.impute import SimpleImputer

numimputer = SimpleImputer(missing_values=np.nan, strategy="mean")
numimputer = numimputer.fit(df[["Height", "Weight"]])
new_value = numimputer.transform(df[["Height", "Weight"]])
new_value
The `strategy` argument can be changed to "median", "most_frequent", or "constant" if one of these is to be used instead.
If the variable is categorical, only most_frequent or constant can be used. For example, assuming SMOKE and CALC have missing values:
import numpy as np
from sklearn.impute import SimpleImputer
catimputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
catimputer = catimputer.fit(df[["SMOKE","CALC"]])
new_value = catimputer.transform(df[["SMOKE","CALC"]])
new_value
# How many rows are exact repeats of an earlier row?
duplicate_row_count = Obesity_data.duplicated().sum()
duplicate_row_count
24
# Show every row that repeats an earlier row (first occurrences are not flagged).
duplicate_mask = Obesity_data.duplicated()
Obesity_data[duplicate_mask]
| Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | NObeyesdad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 98 | Female | 21.0 | 1.52 | 42.0 | no | no | 3.0 | 1.0 | Frequently | no | 1.0 | no | 0.0 | 0.0 | Sometimes | Public_Transportation | Insufficient_Weight |
| 106 | Female | 25.0 | 1.57 | 55.0 | no | yes | 2.0 | 1.0 | Sometimes | no | 2.0 | no | 2.0 | 0.0 | Sometimes | Public_Transportation | Normal_Weight |
| 174 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
| 179 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
| 184 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
| 209 | Female | 22.0 | 1.69 | 65.0 | yes | yes | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 1.0 | 1.0 | Sometimes | Public_Transportation | Normal_Weight |
| 309 | Female | 16.0 | 1.66 | 58.0 | no | no | 2.0 | 1.0 | Sometimes | no | 1.0 | no | 0.0 | 1.0 | no | Walking | Normal_Weight |
| 460 | Female | 18.0 | 1.62 | 55.0 | yes | yes | 2.0 | 3.0 | Frequently | no | 1.0 | no | 1.0 | 1.0 | no | Public_Transportation | Normal_Weight |
| 467 | Male | 22.0 | 1.74 | 75.0 | yes | yes | 3.0 | 3.0 | Frequently | no | 1.0 | no | 1.0 | 0.0 | no | Automobile | Normal_Weight |
| 496 | Male | 18.0 | 1.72 | 53.0 | yes | yes | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 0.0 | 2.0 | Sometimes | Public_Transportation | Insufficient_Weight |
| 527 | Female | 21.0 | 1.52 | 42.0 | no | yes | 3.0 | 1.0 | Frequently | no | 1.0 | no | 0.0 | 0.0 | Sometimes | Public_Transportation | Insufficient_Weight |
| 659 | Female | 21.0 | 1.52 | 42.0 | no | yes | 3.0 | 1.0 | Frequently | no | 1.0 | no | 0.0 | 0.0 | Sometimes | Public_Transportation | Insufficient_Weight |
| 663 | Female | 21.0 | 1.52 | 42.0 | no | yes | 3.0 | 1.0 | Frequently | no | 1.0 | no | 0.0 | 0.0 | Sometimes | Public_Transportation | Insufficient_Weight |
| 763 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
| 764 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
| 824 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
| 830 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
| 831 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
| 832 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
| 833 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
| 834 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
| 921 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
| 922 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
| 923 | Male | 21.0 | 1.62 | 70.0 | no | yes | 2.0 | 1.0 | no | no | 3.0 | no | 1.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_I |
# Shape the data would have without duplicate rows. The result is not
# assigned, so Obesity_data itself keeps all of its rows afterwards.
Obesity_data.drop_duplicates().shape
(2087, 17)
# Display the DataFrame as it currently stands.
Obesity_data
| Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | NObeyesdad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 21.000000 | 1.620000 | 64.000000 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.000000 | no | 0.000000 | 1.000000 | no | Public_Transportation | Normal_Weight |
| 1 | Female | 21.000000 | 1.520000 | 56.000000 | yes | no | 3.0 | 3.0 | Sometimes | yes | 3.000000 | yes | 3.000000 | 0.000000 | Sometimes | Public_Transportation | Normal_Weight |
| 2 | Male | 23.000000 | 1.800000 | 77.000000 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.000000 | no | 2.000000 | 1.000000 | Frequently | Public_Transportation | Normal_Weight |
| 3 | Male | 27.000000 | 1.800000 | 87.000000 | no | no | 3.0 | 3.0 | Sometimes | no | 2.000000 | no | 2.000000 | 0.000000 | Frequently | Walking | Overweight_Level_I |
| 4 | Male | 22.000000 | 1.780000 | 89.800000 | no | no | 2.0 | 1.0 | Sometimes | no | 2.000000 | no | 0.000000 | 0.000000 | Sometimes | Public_Transportation | Overweight_Level_II |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2106 | Female | 20.976842 | 1.710730 | 131.408528 | yes | yes | 3.0 | 3.0 | Sometimes | no | 1.728139 | no | 1.676269 | 0.906247 | Sometimes | Public_Transportation | Obesity_Type_III |
| 2107 | Female | 21.982942 | 1.748584 | 133.742943 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.005130 | no | 1.341390 | 0.599270 | Sometimes | Public_Transportation | Obesity_Type_III |
| 2108 | Female | 22.524036 | 1.752206 | 133.689352 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.054193 | no | 1.414209 | 0.646288 | Sometimes | Public_Transportation | Obesity_Type_III |
| 2109 | Female | 24.361936 | 1.739450 | 133.346641 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.852339 | no | 1.139107 | 0.586035 | Sometimes | Public_Transportation | Obesity_Type_III |
| 2110 | Female | 23.664709 | 1.738836 | 133.472641 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.863513 | no | 1.026452 | 0.714137 | Sometimes | Public_Transportation | Obesity_Type_III |
2111 rows × 17 columns
# Draw one histogram per feature, coloured by the obesity class (NObeyesdad).
# Each entry pairs the column to plot with the title shown above its chart.
histogram_specs = [
    ("Age", "Obesity level based on Age"),
    ("Gender", "Obesity level based on Gender"),
    (
        "family_history_with_overweight",
        "Obesity level based on family_history_with_overweight",
    ),
    ("MTRANS", "Obesity level based on Transportation System"),
    ("SCC", "Obesity Attribute based on Monitoring of Calorie Intake"),
    (
        "FAVC",
        "Obesity Attribute based on frequency of consumption of high calorific food",
    ),
    ("CALC", "Obesity Attribute based on frequency of consumption of Alcohol"),
    ("SMOKE", "Obesity Attribute based on frequency of Smoking"),
]

for column_name, chart_title in histogram_specs:
    plt.figure(figsize=(10, 5))
    plt.title(chart_title)
    sns.histplot(x=column_name, hue="NObeyesdad", data=Obesity_data)
    plt.show()
# Class balance of the target. The column is passed by keyword because
# seaborn only accepts `data` positionally from version 0.12 onwards.
sns.countplot(x='NObeyesdad', data=Obesity_data)
C:\Users\alade\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='NObeyesdad', ylabel='count'>
# Normalisation can be done with any of the following techniques:
#   - Simple feature scaling
#   - Min-Max
#   - Z-Score
# Below is an example of simple feature scaling (divide by the column
# maximum); it is left disabled:
#Obesity_data['Age']= Obesity_data['Age']/Obesity_data['Age'].max()
Obesity_data.Age
0 21.000000
1 21.000000
2 23.000000
3 27.000000
4 22.000000
...
2106 20.976842
2107 21.982942
2108 22.524036
2109 24.361936
2110 23.664709
Name: Age, Length: 2111, dtype: float64
# Min-max scaling example, left disabled.
# NOTE(review): presumably disabled because the frame still holds string
# columns at this point -- confirm before re-enabling.
#from sklearn.preprocessing import MinMaxScaler
#scaler=MinMaxScaler()
#Obesity_data = scaler.fit_transform(Obesity_data)

# Preview Age, the target and the categorical columns explored in the plots.
preview_columns = [
    "Age",
    "Gender",
    "family_history_with_overweight",
    "MTRANS",
    "SCC",
    "FAVC",
    "CALC",
    "SMOKE",
    "NObeyesdad",
]
Obesity_data[preview_columns]
| Age | Gender | family_history_with_overweight | MTRANS | SCC | FAVC | CALC | SMOKE | NObeyesdad | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 21.000000 | Female | yes | Public_Transportation | no | no | no | no | Normal_Weight |
| 1 | 21.000000 | Female | yes | Public_Transportation | yes | no | Sometimes | yes | Normal_Weight |
| 2 | 23.000000 | Male | yes | Public_Transportation | no | no | Frequently | no | Normal_Weight |
| 3 | 27.000000 | Male | no | Walking | no | no | Frequently | no | Overweight_Level_I |
| 4 | 22.000000 | Male | no | Public_Transportation | no | no | Sometimes | no | Overweight_Level_II |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2106 | 20.976842 | Female | yes | Public_Transportation | no | yes | Sometimes | no | Obesity_Type_III |
| 2107 | 21.982942 | Female | yes | Public_Transportation | no | yes | Sometimes | no | Obesity_Type_III |
| 2108 | 22.524036 | Female | yes | Public_Transportation | no | yes | Sometimes | no | Obesity_Type_III |
| 2109 | 24.361936 | Female | yes | Public_Transportation | no | yes | Sometimes | no | Obesity_Type_III |
| 2110 | 23.664709 | Female | yes | Public_Transportation | no | yes | Sometimes | no | Obesity_Type_III |
2111 rows × 9 columns
#print(Obesity_data['Gender'].nunique())
#Obesity_data['Gender'].value_counts()
# Encode Gender numerically: Male -> 0, Female -> 1.
gender_codes = {"Male": 0, "Female": 1}
Obesity_data["Gender"] = Obesity_data["Gender"].replace(gender_codes)
Obesity_data.head()
| Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | NObeyesdad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 21.0 | 1.62 | 64.0 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 0.0 | 1.0 | no | Public_Transportation | Normal_Weight |
| 1 | 1 | 21.0 | 1.52 | 56.0 | yes | no | 3.0 | 3.0 | Sometimes | yes | 3.0 | yes | 3.0 | 0.0 | Sometimes | Public_Transportation | Normal_Weight |
| 2 | 0 | 23.0 | 1.80 | 77.0 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.0 | no | 2.0 | 1.0 | Frequently | Public_Transportation | Normal_Weight |
| 3 | 0 | 27.0 | 1.80 | 87.0 | no | no | 3.0 | 3.0 | Sometimes | no | 2.0 | no | 2.0 | 0.0 | Frequently | Walking | Overweight_Level_I |
| 4 | 0 | 22.0 | 1.78 | 89.8 | no | no | 2.0 | 1.0 | Sometimes | no | 2.0 | no | 0.0 | 0.0 | Sometimes | Public_Transportation | Overweight_Level_II |
#print(Obesity_data['family_history_with_overweight'].nunique())
#Obesity_data['family_history_with_overweight'].value_counts()
# Encode the family-history flag. Note the direction: "yes" -> 0, "no" -> 1.
family_history_codes = {"yes": 0, "no": 1}
Obesity_data["family_history_with_overweight"] = Obesity_data[
    "family_history_with_overweight"
].map(family_history_codes)

# Inspect MTRANS before encoding it: distinct-value count, then frequencies.
print(Obesity_data["MTRANS"].nunique())
Obesity_data["MTRANS"].value_counts()
5
Public_Transportation 1580 Automobile 457 Walking 56 Motorbike 11 Bike 7 Name: MTRANS, dtype: int64
# Encode the transport mode as integer codes.
mtrans_codes = {
    "Public_Transportation": 0,
    "Automobile": 1,
    "Walking": 2,
    "Motorbike": 3,
    "Bike": 4,
}
Obesity_data["MTRANS"] = Obesity_data["MTRANS"].map(mtrans_codes)

# Inspect SCC before encoding it: distinct-value count, then frequencies.
print(Obesity_data["SCC"].nunique())
Obesity_data["SCC"].value_counts()
2
no 2015 yes 96 Name: SCC, dtype: int64
# Encode SCC. Note the direction: "yes" -> 0, "no" -> 1.
scc_codes = {"yes": 0, "no": 1}
Obesity_data["SCC"] = Obesity_data["SCC"].map(scc_codes)

# Inspect FAVC before encoding it: distinct-value count, then frequencies.
print(Obesity_data["FAVC"].nunique())
Obesity_data["FAVC"].value_counts()
2
yes 1866 no 245 Name: FAVC, dtype: int64
# Encode FAVC. Note the direction: "yes" -> 0, "no" -> 1.
favc_codes = {"yes": 0, "no": 1}
Obesity_data["FAVC"] = Obesity_data["FAVC"].map(favc_codes)

# Inspect CAEC before encoding it: distinct-value count, then frequencies.
print(Obesity_data["CAEC"].nunique())
Obesity_data["CAEC"].value_counts()
4
Sometimes 1765 Frequently 242 Always 53 no 51 Name: CAEC, dtype: int64
# Encode CAEC as integer codes.
# NOTE(review): the codes put "Sometimes" (0) before "no" (1), so they do not
# follow the natural frequency scale -- confirm this ordering is intended.
caec_codes = {
    "Sometimes": 0,
    "no": 1,
    "Frequently": 2,
    "Always": 3,
}
Obesity_data["CAEC"] = Obesity_data["CAEC"].map(caec_codes)

# Inspect CALC before encoding it: distinct-value count, then frequencies.
print(Obesity_data["CALC"].nunique())
Obesity_data["CALC"].value_counts()
4
Sometimes 1401 no 639 Frequently 70 Always 1 Name: CALC, dtype: int64
# Encode CALC with the same integer codes that are used for CAEC.
calc_codes = {
    "Sometimes": 0,
    "no": 1,
    "Frequently": 2,
    "Always": 3,
}
Obesity_data["CALC"] = Obesity_data["CALC"].map(calc_codes)

# Inspect SMOKE before encoding it: distinct-value count, then frequencies.
print(Obesity_data["SMOKE"].nunique())
Obesity_data["SMOKE"].value_counts()
2
no 2067 yes 44 Name: SMOKE, dtype: int64
# Encode SMOKE. Note the direction: "yes" -> 0, "no" -> 1.
smoke_codes = {"yes": 0, "no": 1}
Obesity_data["SMOKE"] = Obesity_data["SMOKE"].map(smoke_codes)

# Inspect the target before encoding it: distinct-value count, then frequencies.
print(Obesity_data["NObeyesdad"].nunique())
Obesity_data["NObeyesdad"].value_counts()
7
Obesity_Type_I 351 Obesity_Type_III 324 Obesity_Type_II 297 Overweight_Level_I 290 Overweight_Level_II 290 Normal_Weight 287 Insufficient_Weight 272 Name: NObeyesdad, dtype: int64
# Encode the target classes as integers 0-6. The codes are plain labels and
# do not follow the weight scale (Insufficient_Weight is 6, Obesity_Type_I is 0).
obesity_level_codes = {
    "Obesity_Type_I": 0,
    "Obesity_Type_II": 1,
    "Obesity_Type_III": 2,
    "Overweight_Level_I": 3,
    "Overweight_Level_II": 4,
    "Normal_Weight": 5,
    "Insufficient_Weight": 6,
}
Obesity_data["NObeyesdad"] = Obesity_data["NObeyesdad"].map(obesity_level_codes)
Obesity_data.head()
| Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | NObeyesdad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 21.0 | 1.62 | 64.0 | 0 | 1 | 2.0 | 3.0 | 0 | 1 | 2.0 | 1 | 0.0 | 1.0 | 1 | 0 | 5 |
| 1 | 1 | 21.0 | 1.52 | 56.0 | 0 | 1 | 3.0 | 3.0 | 0 | 0 | 3.0 | 0 | 3.0 | 0.0 | 0 | 0 | 5 |
| 2 | 0 | 23.0 | 1.80 | 77.0 | 0 | 1 | 2.0 | 3.0 | 0 | 1 | 2.0 | 1 | 2.0 | 1.0 | 2 | 0 | 5 |
| 3 | 0 | 27.0 | 1.80 | 87.0 | 1 | 1 | 3.0 | 3.0 | 0 | 1 | 2.0 | 1 | 2.0 | 0.0 | 2 | 2 | 3 |
| 4 | 0 | 22.0 | 1.78 | 89.8 | 1 | 1 | 2.0 | 1.0 | 0 | 1 | 2.0 | 1 | 0.0 | 0.0 | 0 | 0 | 4 |
# Summary statistics, transposed so that each column gets its own row.
summary_stats = Obesity_data.describe()
summary_stats.T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Gender | 2111.0 | 0.494079 | 0.500083 | 0.00 | 0.000000 | 0.000000 | 1.000000 | 1.00 |
| Age | 2111.0 | 24.312600 | 6.345968 | 14.00 | 19.947192 | 22.777890 | 26.000000 | 61.00 |
| Height | 2111.0 | 1.701677 | 0.093305 | 1.45 | 1.630000 | 1.700499 | 1.768464 | 1.98 |
| Weight | 2111.0 | 86.586058 | 26.191172 | 39.00 | 65.473343 | 83.000000 | 107.430682 | 173.00 |
| family_history_with_overweight | 2111.0 | 0.182378 | 0.386247 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 1.00 |
| FAVC | 2111.0 | 0.116059 | 0.320371 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 1.00 |
| FCVC | 2111.0 | 2.419043 | 0.533927 | 1.00 | 2.000000 | 2.385502 | 3.000000 | 3.00 |
| NCP | 2111.0 | 2.685628 | 0.778039 | 1.00 | 2.658738 | 3.000000 | 3.000000 | 4.00 |
| CAEC | 2111.0 | 0.328754 | 0.775161 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 3.00 |
| SMOKE | 2111.0 | 0.979157 | 0.142893 | 0.00 | 1.000000 | 1.000000 | 1.000000 | 1.00 |
| CH2O | 2111.0 | 2.008011 | 0.612953 | 1.00 | 1.584812 | 2.000000 | 2.477420 | 3.00 |
| SCC | 2111.0 | 0.954524 | 0.208395 | 0.00 | 1.000000 | 1.000000 | 1.000000 | 1.00 |
| FAF | 2111.0 | 1.010298 | 0.850592 | 0.00 | 0.124505 | 1.000000 | 1.666678 | 3.00 |
| TUE | 2111.0 | 0.657866 | 0.608927 | 0.00 | 0.000000 | 0.625350 | 1.000000 | 2.00 |
| CALC | 2111.0 | 0.370441 | 0.550017 | 0.00 | 0.000000 | 0.000000 | 1.000000 | 3.00 |
| MTRANS | 2111.0 | 0.298437 | 0.577618 | 0.00 | 0.000000 | 0.000000 | 1.000000 | 4.00 |
| NObeyesdad | 2111.0 | 2.862151 | 2.009089 | 0.00 | 1.000000 | 3.000000 | 5.000000 | 6.00 |
# Pairwise correlation between all columns, now that every column is encoded
# as a number; kept in Obesity_corr for the heatmap.
Obesity_corr=Obesity_data.corr()
Obesity_corr
| Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | NObeyesdad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Gender | 1.000000 | -0.048394 | -0.618466 | -0.161668 | 0.102512 | 0.064934 | 0.274505 | -0.067600 | 0.077157 | 0.044698 | -0.107930 | -0.102633 | -0.189607 | -0.017269 | -0.045436 | -0.159592 | 0.119238 |
| Age | -0.048394 | 1.000000 | -0.025958 | 0.202560 | -0.205725 | -0.063902 | 0.016291 | -0.043944 | -0.131644 | -0.091987 | -0.045304 | 0.116283 | -0.144938 | -0.296931 | 0.036711 | 0.405192 | -0.301656 |
| Height | -0.618466 | -0.025958 | 1.000000 | 0.463136 | -0.247684 | -0.178364 | -0.038121 | 0.243672 | -0.116778 | -0.055499 | 0.213376 | 0.133753 | 0.294709 | 0.051912 | -0.077203 | 0.083717 | -0.129457 |
| Weight | -0.161668 | 0.202560 | 0.463136 | 1.000000 | -0.496820 | -0.272300 | 0.216125 | 0.107469 | -0.391536 | -0.025746 | 0.200575 | 0.201906 | -0.051436 | -0.071561 | -0.238067 | -0.088426 | -0.721134 |
| family_history_with_overweight | 0.102512 | -0.205725 | -0.247684 | -0.496820 | 1.000000 | 0.208036 | -0.040372 | -0.071370 | 0.303019 | 0.017385 | -0.147437 | -0.185422 | 0.056673 | -0.022943 | -0.014768 | -0.008281 | 0.458095 |
| FAVC | 0.064934 | -0.063902 | -0.178364 | -0.272300 | 0.208036 | 1.000000 | 0.027283 | 0.007000 | 0.187893 | -0.050660 | -0.009719 | -0.190658 | 0.107995 | -0.068417 | 0.137821 | 0.104705 | 0.273007 |
| FCVC | 0.274505 | 0.016291 | -0.038121 | 0.216125 | -0.040372 | 0.027283 | 1.000000 | 0.042216 | 0.008009 | -0.014320 | 0.068461 | -0.071852 | 0.019939 | -0.101135 | -0.078571 | -0.050446 | 0.013033 |
| NCP | -0.067600 | -0.043944 | 0.243672 | 0.107469 | -0.071370 | 0.007000 | 0.042216 | 1.000000 | 0.026022 | -0.007811 | 0.057088 | 0.015624 | 0.129504 | 0.036326 | -0.095262 | 0.052452 | 0.088641 |
| CAEC | 0.077157 | -0.131644 | -0.116778 | -0.391536 | 0.303019 | 0.187893 | 0.008009 | 0.026022 | 1.000000 | -0.057912 | -0.081347 | -0.156784 | 0.046203 | -0.011336 | 0.144413 | 0.017872 | 0.407684 |
| SMOKE | 0.044698 | -0.091987 | -0.055499 | -0.025746 | 0.017385 | -0.050660 | -0.014320 | -0.007811 | -0.057912 | 1.000000 | 0.031995 | 0.047731 | -0.011216 | -0.017613 | -0.022315 | -0.027957 | 0.014750 |
| CH2O | -0.107930 | -0.045304 | 0.213376 | 0.200575 | -0.147437 | -0.009719 | 0.068461 | 0.057088 | -0.081347 | 0.031995 | 1.000000 | -0.008036 | 0.167236 | 0.011965 | -0.037409 | -0.018297 | -0.113323 |
| SCC | -0.102633 | 0.116283 | 0.133753 | 0.201906 | -0.185422 | -0.190658 | -0.071852 | 0.015624 | -0.156784 | 0.047731 | -0.008036 | 1.000000 | -0.074221 | 0.010928 | -0.055562 | -0.025002 | -0.153079 |
| FAF | -0.189607 | -0.144938 | 0.294709 | -0.051436 | 0.056673 | 0.107995 | 0.019939 | 0.129504 | 0.046203 | -0.011216 | 0.167236 | -0.074221 | 1.000000 | 0.058562 | 0.134771 | 0.082912 | 0.132069 |
| TUE | -0.017269 | -0.296931 | 0.051912 | -0.071561 | -0.022943 | -0.068417 | -0.101135 | 0.036326 | -0.011336 | -0.017613 | 0.011965 | 0.010928 | 0.058562 | 1.000000 | 0.112034 | -0.110762 | 0.097669 |
| CALC | -0.045436 | 0.036711 | -0.077203 | -0.238067 | -0.014768 | 0.137821 | -0.078571 | -0.095262 | 0.144413 | -0.022315 | -0.037409 | -0.055562 | 0.134771 | 0.112034 | 1.000000 | 0.108336 | 0.082688 |
| MTRANS | -0.159592 | 0.405192 | 0.083717 | -0.088426 | -0.008281 | 0.104705 | -0.050446 | 0.052452 | 0.017872 | -0.027957 | -0.018297 | -0.025002 | 0.082912 | -0.110762 | 0.108336 | 1.000000 | 0.034242 |
| NObeyesdad | 0.119238 | -0.301656 | -0.129457 | -0.721134 | 0.458095 | 0.273007 | 0.013033 | 0.088641 | 0.407684 | 0.014750 | -0.113323 | -0.153079 | 0.132069 | 0.097669 | 0.082688 | 0.034242 | 1.000000 |
# Annotated heatmap of the correlation matrix on a reversed CMRmap palette.
heatmap_size = (12, 10)
heatmap_palette = plt.cm.CMRmap_r

plt.figure(figsize=heatmap_size)
sns.heatmap(Obesity_corr, cmap=heatmap_palette, annot=True)
plt.title('Heatmap for Obesity_corr Correlation')
plt.show()
from sklearn.model_selection import train_test_split

# Separate the predictors from the encoded target column.
X = Obesity_data.drop(columns='NObeyesdad')
y = Obesity_data['NObeyesdad']

# Hold out 30% of the rows, stratified on the target; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=45,
)
X_train.shape, X_test.shape
((1477, 16), (634, 16))
# Standardise the features: subtract the mean and divide by the standard deviation.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Learn the statistics from the training split only, then apply them to both splits.
sc.fit(X_train)
X_train_ScaledObesityData = sc.transform(X_train)
X_test_ScaledObesityData = sc.transform(X_test)
from sklearn import tree

# Fit a decision tree on the standardised training features (seed fixed),
# then predict the class of every standardised test row.
tr = tree.DecisionTreeClassifier(random_state=45).fit(
    X_train_ScaledObesityData, y_train
)
y_pred = tr.predict(X_test_ScaledObesityData)
y_pred
array([3, 6, 3, 0, 4, 4, 1, 0, 4, 6, 4, 6, 6, 3, 3, 5, 6, 1, 1, 3, 4, 2,
0, 1, 5, 5, 4, 0, 4, 0, 2, 6, 5, 1, 5, 4, 3, 2, 4, 3, 6, 0, 3, 3,
0, 1, 1, 5, 0, 1, 1, 1, 0, 5, 2, 2, 1, 0, 3, 4, 6, 6, 0, 0, 2, 5,
1, 1, 2, 0, 0, 1, 6, 1, 2, 2, 1, 2, 5, 0, 3, 2, 1, 3, 1, 1, 4, 4,
0, 3, 6, 1, 5, 5, 3, 5, 2, 5, 3, 3, 6, 4, 6, 0, 0, 5, 4, 6, 4, 4,
3, 5, 4, 6, 6, 0, 5, 5, 1, 2, 6, 5, 1, 2, 2, 2, 4, 5, 6, 4, 0, 0,
4, 4, 2, 2, 4, 4, 0, 4, 5, 4, 0, 5, 6, 5, 6, 3, 2, 3, 4, 3, 2, 6,
3, 5, 5, 0, 6, 0, 5, 5, 5, 0, 0, 4, 1, 3, 2, 1, 6, 2, 2, 1, 4, 6,
2, 5, 4, 5, 2, 0, 2, 5, 5, 6, 0, 4, 6, 6, 1, 4, 2, 4, 2, 2, 2, 2,
5, 0, 1, 1, 1, 6, 2, 5, 2, 1, 5, 5, 6, 5, 5, 1, 0, 6, 4, 1, 3, 2,
5, 1, 6, 4, 2, 1, 0, 0, 0, 4, 4, 2, 4, 0, 2, 0, 0, 4, 1, 4, 0, 2,
6, 4, 1, 5, 2, 6, 4, 5, 4, 3, 6, 6, 3, 0, 5, 4, 6, 0, 4, 6, 4, 3,
4, 4, 5, 2, 2, 2, 6, 1, 3, 2, 2, 1, 4, 2, 5, 0, 2, 4, 4, 0, 0, 1,
3, 2, 6, 5, 6, 6, 6, 1, 1, 0, 3, 2, 2, 2, 3, 1, 5, 4, 6, 3, 5, 4,
6, 6, 2, 5, 0, 6, 5, 4, 1, 0, 4, 0, 3, 4, 5, 1, 2, 5, 4, 0, 1, 3,
1, 6, 4, 0, 4, 6, 5, 0, 6, 1, 0, 2, 0, 6, 3, 5, 0, 0, 3, 3, 5, 3,
5, 0, 3, 3, 2, 6, 2, 5, 0, 1, 0, 0, 4, 1, 3, 1, 0, 5, 0, 3, 2, 6,
3, 6, 2, 6, 5, 5, 5, 6, 4, 1, 3, 6, 5, 0, 1, 3, 6, 6, 0, 5, 0, 6,
1, 2, 5, 2, 1, 1, 4, 6, 5, 4, 4, 3, 6, 6, 1, 4, 5, 0, 2, 3, 1, 6,
1, 4, 1, 1, 0, 2, 4, 1, 5, 0, 5, 4, 3, 4, 2, 4, 5, 5, 0, 1, 6, 2,
2, 5, 2, 2, 6, 5, 0, 0, 4, 4, 0, 4, 0, 3, 2, 3, 2, 1, 5, 1, 6, 3,
4, 4, 1, 2, 0, 3, 1, 6, 2, 5, 3, 4, 1, 0, 6, 3, 6, 2, 0, 3, 6, 3,
1, 0, 0, 3, 0, 5, 0, 0, 4, 4, 3, 5, 5, 2, 0, 3, 2, 4, 1, 2, 3, 4,
1, 2, 0, 2, 2, 0, 2, 1, 5, 4, 5, 4, 0, 0, 0, 4, 1, 5, 3, 3, 6, 3,
4, 0, 6, 0, 2, 0, 0, 0, 1, 0, 0, 0, 3, 5, 6, 4, 3, 6, 3, 6, 6, 2,
0, 0, 3, 0, 4, 0, 3, 6, 2, 3, 1, 0, 1, 0, 4, 6, 5, 6, 0, 0, 1, 5,
4, 2, 3, 3, 5, 0, 2, 2, 2, 2, 2, 1, 6, 1, 0, 0, 0, 6, 2, 6, 5, 0,
2, 1, 1, 5, 2, 1, 2, 2, 1, 2, 4, 4, 2, 3, 2, 5, 2, 1, 6, 4, 6, 5,
6, 3, 1, 0, 2, 1, 0, 0, 0, 3, 4, 4, 6, 6, 5, 2, 1, 1], dtype=int64)
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted class matches the true class.
Obesity_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('The Accuracy of the scores is: : ', Obesity_accuracy)
The Accuracy of the scores is: : 0.9100946372239748
# Accuracy of the tree on the training split versus the held-out test split.
ObesityAccuracy_train_score, ObesityAccuracy_test_score = (
    tr.score(X_train_ScaledObesityData, y_train),
    tr.score(X_test_ScaledObesityData, y_test),
)
print('The Train score of the Decison Tree algorithm is : ' , ObesityAccuracy_train_score)
print('The Test score of the Decison Tree algorithm is : ' , ObesityAccuracy_test_score)
The Train score of the Decison Tree algorithm is : 1.0 The Test score of the Decison Tree algorithm is : 0.9100946372239748
from sklearn import metrics

# Test accuracy as a percentage, printed together with the raw predictions.
Obesity_prediction = tr.score(X_test_ScaledObesityData, y_test) * 100
tree_test_predictions = tr.predict(X_test_ScaledObesityData)
print(
    'Predictions for Decision Tree Classifier : \n',
    tree_test_predictions,
    '\n Accuracy:',
    Obesity_prediction,
    '%',
)
Predictions for Decision Tree Classifier : [3 6 3 0 4 4 1 0 4 6 4 6 6 3 3 5 6 1 1 3 4 2 0 1 5 5 4 0 4 0 2 6 5 1 5 4 3 2 4 3 6 0 3 3 0 1 1 5 0 1 1 1 0 5 2 2 1 0 3 4 6 6 0 0 2 5 1 1 2 0 0 1 6 1 2 2 1 2 5 0 3 2 1 3 1 1 4 4 0 3 6 1 5 5 3 5 2 5 3 3 6 4 6 0 0 5 4 6 4 4 3 5 4 6 6 0 5 5 1 2 6 5 1 2 2 2 4 5 6 4 0 0 4 4 2 2 4 4 0 4 5 4 0 5 6 5 6 3 2 3 4 3 2 6 3 5 5 0 6 0 5 5 5 0 0 4 1 3 2 1 6 2 2 1 4 6 2 5 4 5 2 0 2 5 5 6 0 4 6 6 1 4 2 4 2 2 2 2 5 0 1 1 1 6 2 5 2 1 5 5 6 5 5 1 0 6 4 1 3 2 5 1 6 4 2 1 0 0 0 4 4 2 4 0 2 0 0 4 1 4 0 2 6 4 1 5 2 6 4 5 4 3 6 6 3 0 5 4 6 0 4 6 4 3 4 4 5 2 2 2 6 1 3 2 2 1 4 2 5 0 2 4 4 0 0 1 3 2 6 5 6 6 6 1 1 0 3 2 2 2 3 1 5 4 6 3 5 4 6 6 2 5 0 6 5 4 1 0 4 0 3 4 5 1 2 5 4 0 1 3 1 6 4 0 4 6 5 0 6 1 0 2 0 6 3 5 0 0 3 3 5 3 5 0 3 3 2 6 2 5 0 1 0 0 4 1 3 1 0 5 0 3 2 6 3 6 2 6 5 5 5 6 4 1 3 6 5 0 1 3 6 6 0 5 0 6 1 2 5 2 1 1 4 6 5 4 4 3 6 6 1 4 5 0 2 3 1 6 1 4 1 1 0 2 4 1 5 0 5 4 3 4 2 4 5 5 0 1 6 2 2 5 2 2 6 5 0 0 4 4 0 4 0 3 2 3 2 1 5 1 6 3 4 4 1 2 0 3 1 6 2 5 3 4 1 0 6 3 6 2 0 3 6 3 1 0 0 3 0 5 0 0 4 4 3 5 5 2 0 3 2 4 1 2 3 4 1 2 0 2 2 0 2 1 5 4 5 4 0 0 0 4 1 5 3 3 6 3 4 0 6 0 2 0 0 0 1 0 0 0 3 5 6 4 3 6 3 6 6 2 0 0 3 0 4 0 3 6 2 3 1 0 1 0 4 6 5 6 0 0 1 5 4 2 3 3 5 0 2 2 2 2 2 1 6 1 0 0 0 6 2 6 5 0 2 1 1 5 2 1 2 2 1 2 4 4 2 3 2 5 2 1 6 4 6 5 6 3 1 0 2 1 0 0 0 3 4 4 6 6 5 2 1 1] Accuracy: 91.00946372239747 %
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the decision-tree predictions.
DecisionTree_report = classification_report(y_true=y_test, y_pred=y_pred)
print(DecisionTree_report)
precision recall f1-score support
0 0.91 0.94 0.93 106
1 0.97 0.94 0.95 89
2 0.99 1.00 0.99 97
3 0.95 0.79 0.86 87
4 0.88 0.93 0.91 87
5 0.80 0.81 0.81 86
6 0.87 0.93 0.90 82
accuracy 0.91 634
macro avg 0.91 0.91 0.91 634
weighted avg 0.91 0.91 0.91 634
from sklearn.metrics import confusion_matrix

# Confusion matrix of true classes against the decision-tree predictions.
tree_confusion = confusion_matrix(y_test, y_pred)
print(tree_confusion)
[[100 3 0 0 3 0 0] [ 4 84 1 0 0 0 0] [ 0 0 97 0 0 0 0] [ 1 0 0 69 6 11 0] [ 5 0 0 1 81 0 0] [ 0 0 0 3 2 70 11] [ 0 0 0 0 0 6 76]]
# Inspect the raw (unscaled) feature values of the test row at position 80.
X_test.iloc[80]
Gender 1.000000 Age 21.000000 Height 1.753578 Weight 77.979170 family_history_with_overweight 0.000000 FAVC 0.000000 FCVC 2.273548 NCP 2.390070 CAEC 0.000000 SMOKE 1.000000 CH2O 1.648404 SCC 1.000000 FAF 0.874643 TUE 1.102696 CALC 0.000000 MTRANS 0.000000 Name: 937, dtype: float64
# The tree was fitted on standardised features, so a raw row must go through
# the same fitted scaler before prediction; feeding it unscaled values yields
# a meaningless class. The double brackets keep the row as a one-row DataFrame.
tr.predict(sc.transform(X_test.iloc[[80]]))
array([2], dtype=int64)
# Inspect the raw (unscaled) feature values of the test row at position 23.
X_test.iloc[23]
Gender 0.000000 Age 25.124595 Height 1.771510 Weight 113.207124 family_history_with_overweight 0.000000 FAVC 0.000000 FCVC 1.457758 NCP 3.000000 CAEC 0.000000 SMOKE 1.000000 CH2O 2.020249 SCC 1.000000 FAF 1.556709 TUE 0.001330 CALC 0.000000 MTRANS 0.000000 Name: 1585, dtype: float64
# The tree was fitted on standardised features, so a raw row must go through
# the same fitted scaler before prediction. The double brackets keep the row
# as a one-row DataFrame.
tr.predict(sc.transform(X_test.iloc[[23]]))
array([1], dtype=int64)
# Put the true labels and the decision-tree predictions side by side.
tree_comparison_columns = {
    'Actual Value': y_test,
    'Predicted Value': y_pred,
}
Checking_dataframe = pd.DataFrame(tree_comparison_columns)
Checking_dataframe
| Actual Value | Predicted Value | |
|---|---|---|
| 920 | 3 | 3 |
| 496 | 6 | 6 |
| 271 | 5 | 3 |
| 78 | 0 | 0 |
| 1021 | 4 | 4 |
| ... | ... | ... |
| 622 | 6 | 6 |
| 437 | 5 | 5 |
| 1908 | 2 | 2 |
| 1779 | 1 | 1 |
| 1739 | 1 | 1 |
634 rows × 2 columns
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with max_features=5 and a fixed seed, trained on
# the standardised training features.
RandomForest_Obesity = RandomForestClassifier(
    n_estimators=100,
    max_features=5,
    random_state=45,
    n_jobs=-1,
)
RandomForest_Obesity.fit(X_train_ScaledObesityData, y_train)

# NOTE: this rebinds y_pred, replacing the decision-tree predictions.
y_pred = RandomForest_Obesity.predict(X_test_ScaledObesityData)
y_pred
array([3, 6, 5, 0, 4, 4, 1, 0, 4, 6, 4, 6, 6, 3, 3, 5, 6, 1, 0, 3, 4, 2,
0, 1, 5, 5, 4, 0, 0, 0, 2, 6, 5, 1, 5, 4, 5, 2, 4, 3, 6, 0, 3, 3,
0, 1, 1, 4, 0, 1, 1, 1, 0, 5, 2, 2, 1, 0, 3, 4, 6, 6, 0, 0, 2, 5,
1, 1, 2, 0, 0, 1, 6, 1, 2, 2, 1, 2, 5, 0, 3, 2, 1, 3, 1, 1, 4, 4,
0, 3, 6, 1, 5, 5, 3, 5, 2, 5, 3, 3, 6, 4, 5, 0, 4, 5, 4, 6, 4, 0,
3, 3, 4, 6, 6, 0, 5, 5, 1, 2, 6, 3, 1, 2, 2, 2, 4, 5, 6, 4, 0, 0,
4, 4, 2, 2, 4, 4, 0, 4, 6, 4, 0, 5, 6, 6, 6, 3, 2, 3, 4, 3, 2, 6,
3, 5, 5, 0, 6, 0, 5, 5, 5, 0, 0, 4, 1, 3, 2, 1, 6, 2, 2, 0, 4, 6,
2, 5, 4, 5, 2, 0, 2, 3, 3, 6, 0, 4, 6, 6, 1, 4, 2, 5, 2, 2, 2, 2,
5, 0, 1, 1, 1, 6, 2, 5, 2, 1, 3, 5, 5, 5, 5, 1, 0, 5, 4, 1, 3, 2,
5, 1, 6, 4, 2, 1, 0, 0, 0, 0, 4, 2, 4, 0, 2, 0, 0, 4, 1, 4, 0, 2,
6, 4, 1, 5, 2, 6, 4, 6, 0, 3, 6, 6, 3, 0, 5, 4, 6, 0, 4, 6, 4, 3,
4, 4, 5, 2, 2, 2, 6, 1, 3, 2, 2, 1, 4, 2, 5, 0, 2, 5, 4, 0, 0, 1,
3, 2, 6, 5, 6, 6, 5, 1, 1, 0, 3, 2, 2, 2, 3, 1, 5, 4, 6, 3, 5, 3,
6, 6, 2, 5, 5, 6, 5, 4, 1, 0, 4, 0, 3, 4, 5, 1, 2, 5, 4, 3, 1, 3,
1, 5, 4, 0, 3, 6, 6, 0, 5, 1, 0, 2, 0, 6, 3, 5, 4, 5, 3, 3, 5, 3,
5, 0, 3, 3, 2, 6, 2, 5, 0, 1, 0, 0, 4, 1, 3, 1, 0, 5, 0, 5, 2, 6,
3, 6, 2, 6, 5, 5, 5, 6, 4, 1, 3, 6, 3, 5, 1, 3, 6, 6, 0, 5, 0, 6,
1, 2, 5, 2, 1, 1, 4, 6, 5, 4, 4, 3, 6, 6, 1, 4, 5, 4, 2, 3, 1, 5,
1, 4, 1, 1, 0, 2, 4, 1, 5, 0, 3, 4, 3, 4, 2, 4, 5, 5, 0, 1, 6, 2,
2, 5, 2, 2, 6, 5, 0, 0, 4, 4, 0, 4, 0, 3, 2, 3, 2, 1, 5, 1, 6, 5,
5, 4, 1, 2, 0, 5, 1, 6, 2, 6, 3, 3, 1, 0, 6, 3, 6, 2, 0, 3, 5, 3,
1, 0, 0, 3, 0, 5, 0, 0, 4, 4, 3, 5, 5, 2, 0, 5, 2, 4, 1, 2, 3, 4,
1, 2, 0, 2, 2, 0, 2, 1, 5, 4, 5, 5, 1, 0, 0, 4, 1, 5, 4, 3, 6, 3,
5, 0, 5, 0, 2, 0, 0, 0, 1, 0, 0, 0, 3, 5, 6, 4, 3, 6, 3, 6, 6, 2,
0, 0, 3, 0, 4, 0, 5, 6, 2, 5, 1, 0, 1, 0, 4, 6, 5, 6, 0, 0, 1, 5,
4, 2, 3, 3, 5, 0, 2, 2, 2, 2, 2, 1, 6, 1, 0, 0, 0, 6, 2, 6, 5, 0,
2, 1, 1, 5, 2, 1, 2, 2, 1, 2, 3, 4, 2, 3, 2, 5, 2, 1, 6, 4, 5, 5,
6, 3, 1, 0, 2, 1, 0, 0, 0, 3, 4, 4, 6, 6, 5, 2, 1, 1], dtype=int64)
from sklearn.metrics import accuracy_score

# Fraction of test rows the random forest classified correctly.
RandomForest_ObesityAccuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("The accuracy of the score is: ", RandomForest_ObesityAccuracy)
The accuracy of the score is: 0.9416403785488959
# Accuracy of the forest on the training split versus the held-out test split.
RandomForest_train_score, RandomForest_test_score = (
    RandomForest_Obesity.score(X_train_ScaledObesityData, y_train),
    RandomForest_Obesity.score(X_test_ScaledObesityData, y_test),
)
print('Predictions for Random Forest Classifier Train Score is : ' , RandomForest_train_score)
print('Predictions for Random Forest Classifier Test Score is : : ' , RandomForest_test_score)
Predictions for Random Forest Classifier Train Score is : 1.0 Predictions for Random Forest Classifier Test Score is : : 0.9416403785488959
from sklearn import metrics

# Test accuracy as a percentage, printed together with the raw predictions.
RandomForest_accuracy = RandomForest_Obesity.score(X_test_ScaledObesityData, y_test) * 100
forest_test_predictions = RandomForest_Obesity.predict(X_test_ScaledObesityData)
print(
    ' Predictions for Random Forest Classifier : \n',
    forest_test_predictions,
    '\n Accuracy:',
    RandomForest_accuracy,
    '%',
)
Predictions for Random Forest Classifier : [3 6 5 0 4 4 1 0 4 6 4 6 6 3 3 5 6 1 0 3 4 2 0 1 5 5 4 0 0 0 2 6 5 1 5 4 5 2 4 3 6 0 3 3 0 1 1 4 0 1 1 1 0 5 2 2 1 0 3 4 6 6 0 0 2 5 1 1 2 0 0 1 6 1 2 2 1 2 5 0 3 2 1 3 1 1 4 4 0 3 6 1 5 5 3 5 2 5 3 3 6 4 5 0 4 5 4 6 4 0 3 3 4 6 6 0 5 5 1 2 6 3 1 2 2 2 4 5 6 4 0 0 4 4 2 2 4 4 0 4 6 4 0 5 6 6 6 3 2 3 4 3 2 6 3 5 5 0 6 0 5 5 5 0 0 4 1 3 2 1 6 2 2 0 4 6 2 5 4 5 2 0 2 3 3 6 0 4 6 6 1 4 2 5 2 2 2 2 5 0 1 1 1 6 2 5 2 1 3 5 5 5 5 1 0 5 4 1 3 2 5 1 6 4 2 1 0 0 0 0 4 2 4 0 2 0 0 4 1 4 0 2 6 4 1 5 2 6 4 6 0 3 6 6 3 0 5 4 6 0 4 6 4 3 4 4 5 2 2 2 6 1 3 2 2 1 4 2 5 0 2 5 4 0 0 1 3 2 6 5 6 6 5 1 1 0 3 2 2 2 3 1 5 4 6 3 5 3 6 6 2 5 5 6 5 4 1 0 4 0 3 4 5 1 2 5 4 3 1 3 1 5 4 0 3 6 6 0 5 1 0 2 0 6 3 5 4 5 3 3 5 3 5 0 3 3 2 6 2 5 0 1 0 0 4 1 3 1 0 5 0 5 2 6 3 6 2 6 5 5 5 6 4 1 3 6 3 5 1 3 6 6 0 5 0 6 1 2 5 2 1 1 4 6 5 4 4 3 6 6 1 4 5 4 2 3 1 5 1 4 1 1 0 2 4 1 5 0 3 4 3 4 2 4 5 5 0 1 6 2 2 5 2 2 6 5 0 0 4 4 0 4 0 3 2 3 2 1 5 1 6 5 5 4 1 2 0 5 1 6 2 6 3 3 1 0 6 3 6 2 0 3 5 3 1 0 0 3 0 5 0 0 4 4 3 5 5 2 0 5 2 4 1 2 3 4 1 2 0 2 2 0 2 1 5 4 5 5 1 0 0 4 1 5 4 3 6 3 5 0 5 0 2 0 0 0 1 0 0 0 3 5 6 4 3 6 3 6 6 2 0 0 3 0 4 0 5 6 2 5 1 0 1 0 4 6 5 6 0 0 1 5 4 2 3 3 5 0 2 2 2 2 2 1 6 1 0 0 0 6 2 6 5 0 2 1 1 5 2 1 2 2 1 2 3 4 2 3 2 5 2 1 6 4 5 5 6 3 1 0 2 1 0 0 0 3 4 4 6 6 5 2 1 1] Accuracy: 94.1640378548896 %
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the random-forest predictions.
RandomForest_Report = classification_report(y_true=y_test, y_pred=y_pred)
print(RandomForest_Report)
precision recall f1-score support
0 0.95 0.97 0.96 106
1 0.99 0.96 0.97 89
2 0.99 1.00 0.99 97
3 0.97 0.85 0.91 87
4 0.95 0.92 0.94 87
5 0.80 0.93 0.86 86
6 0.95 0.95 0.95 82
accuracy 0.94 634
macro avg 0.94 0.94 0.94 634
weighted avg 0.95 0.94 0.94 634
from sklearn.metrics import confusion_matrix

# Confusion matrix of true classes against the random-forest predictions.
forest_confusion = confusion_matrix(y_test, y_pred)
print(forest_confusion)
[[103 1 0 0 1 1 0] [ 2 85 1 0 0 1 0] [ 0 0 97 0 0 0 0] [ 0 0 0 74 2 11 0] [ 3 0 0 1 80 3 0] [ 0 0 0 1 1 80 4] [ 0 0 0 0 0 4 78]]
# Inspect the raw (unscaled) feature values of the test row at position 20.
X_test.iloc[20]
Gender 0.000000 Age 34.543563 Height 1.765188 Weight 85.000000 family_history_with_overweight 0.000000 FAVC 0.000000 FCVC 2.694281 NCP 3.000000 CAEC 0.000000 SMOKE 1.000000 CH2O 2.653831 SCC 1.000000 FAF 1.965820 TUE 0.891189 CALC 1.000000 MTRANS 1.000000 Name: 1014, dtype: float64
# The forest was fitted on standardised features, so a raw row must go through
# the same fitted scaler before prediction; feeding it unscaled values yields
# a meaningless class. The double brackets keep the row as a one-row DataFrame.
RandomForest_Obesity.predict(sc.transform(X_test.iloc[[20]]))
array([1], dtype=int64)
# Inspect the raw (unscaled) feature values of the test row at position 40.
X_test.iloc[40]
Gender 0.00 Age 17.00 Height 1.79 Weight 57.00 family_history_with_overweight 0.00 FAVC 0.00 FCVC 2.00 NCP 4.00 CAEC 2.00 SMOKE 1.00 CH2O 2.00 SCC 1.00 FAF 2.00 TUE 1.00 CALC 1.00 MTRANS 0.00 Name: 310, dtype: float64
# The forest was fitted on standardised features, so a raw row must go through
# the same fitted scaler before prediction. The double brackets keep the row
# as a one-row DataFrame.
RandomForest_Obesity.predict(sc.transform(X_test.iloc[[40]]))
array([0], dtype=int64)
# Put the true labels and the random-forest predictions side by side.
forest_comparison_columns = {
    'Actual Value': y_test,
    'Predicted Value': y_pred,
}
RfChecking_dataframe = pd.DataFrame(forest_comparison_columns)
RfChecking_dataframe
| Actual Value | Predicted Value | |
|---|---|---|
| 920 | 3 | 3 |
| 496 | 6 | 6 |
| 271 | 5 | 5 |
| 78 | 0 | 0 |
| 1021 | 4 | 4 |
| ... | ... | ... |
| 622 | 6 | 6 |
| 437 | 5 | 5 |
| 1908 | 2 | 2 |
| 1779 | 1 | 1 |
| 1739 | 1 | 1 |
634 rows × 2 columns
# Summary table of the two models. accuracy_score returns a fraction in
# [0, 1], so scale by 100 to match the 'Accuracy %' column header.
pd.DataFrame(
    data={
        'Model': ['Decision Tree Classifier', 'Random Forest Classifier'],
        'Accuracy %': [100 * Obesity_accuracy, 100 * RandomForest_ObesityAccuracy],
    }
)
| Model | Accuracy % | |
|---|---|---|
| 0 | Decision Tree Classifier | 0.910095 |
| 1 | Random Forest Classifier | 0.941640 |
# Bar chart comparing the test accuracy of the two classifiers.
# The seaborn style sheets were renamed to "seaborn-v0_8" in matplotlib 3.6
# and the old names were later removed, so use whichever one is available.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)

# Use dedicated names here: rebinding `y` would clobber the target Series
# that was created for train_test_split earlier in the notebook.
model_names = ['Decision Tree Classifier', 'Random Forest Classifier']
model_accuracies = [Obesity_accuracy, RandomForest_ObesityAccuracy]

fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=model_names, y=model_accuracies, palette='inferno', ax=ax)
plt.ylabel('Model Accuracy')
plt.xticks(rotation=40)
# The trailing semicolon suppresses the Text object echo in the notebook.
plt.title('Comparison of the two models - Accuracy of the Models', fontsize=15, fontname='monospace', y=1.03);